# Report-wide knitr chunk options: fix the figure size for every chunk.
knitr::opts_chunk$set(fig.height = 5)
knitr::opts_chunk$set(fig.width = 8.5)
library(readr)
library(ggplot2)   # plots (PCA scatter)
library(dplyr)     # data manipulation; also re-exports as_tibble()
library(methods)
library(stringi)   # file-extension matching
library(keras)     # resnet50 + dense classifier head
library(glmnet)    # penalized multinomial regression baseline
In this project, I construct a transfer learning model to classify a set of images spanning 22 different categories. The data I use is the Caltech101 dataset, from which I select a subset of 22 distinct categories. The Caltech101 dataset was collected in September 2003 by Fei-Fei Li, Marco Andreetto, and Marc ’Aurelio Ranzato.
The following is a precomputed summary of how many images are in each category:
accordion 55 anchor 42 ant 42 bass 54 beaver 46 binocular 33 brain 98 brontosaurus 43 buddha 85 dalmatian 67 electric_guitar 75 inline_skate 31 lamp 61 octopus 35 pagoda 47 panda 38 stop_sign 64 strawberry 35 sunflower 85 tick 49 wrench 39 yin_yang 60
Since this is a transfer learning model, I utilized a pre-trained deep neural net with many convolutional layers called resnet50. This model was developed by Kaiming He at Microsoft Research in 2016. Their models have substantially more layers than previously used models. The ResNet family was trained on the ImageNet dataset with up to 152 layers, which is 8x deeper than VGG nets while still having lower complexity.
First, I load in the resnet50 model, take its second-to-last layer, and create an embedding matrix from my image data. This includes picture metadata, as well as an embedding matrix with the output from the layers I used from resnet50.
# Load ResNet-50 pretrained on ImageNet, then build a feature extractor
# that stops at the penultimate ("avg_pool") layer, whose output is the
# 2048-dimensional embedding used below.
resnet50 <- application_resnet50(weights = 'imagenet', include_top = TRUE)
model_avg_pool <- keras_model(
  inputs = resnet50$input,
  outputs = get_layer(resnet50, 'avg_pool')$output
)
Next, I load in my images and partition them among their classifications.
# Find every jpg/jpeg/png under the data directory, take each file's
# parent folder as its class label, and load all pixels into one
# n x 224 x 224 x 3 array (the input size ResNet-50 expects).
input_dir <- "101_ObjectCategories/"
image_paths <- dir(input_dir, recursive = TRUE)
ext <- stri_match(image_paths, regex = "\\.([A-Za-z]+$)")[, 2]
is_image <- stri_trans_tolower(ext) %in% c("jpg", "png", "jpeg")
image_paths <- image_paths[is_image]
class_vector <- dirname(image_paths)
class_factor <- factor(class_vector)
class_names <- levels(class_factor)
n <- length(class_vector)
# Zero-based integer labels, matching the keras convention.
y <- as.numeric(class_factor) - 1L
Z <- array(0, dim = c(n, 224, 224, 3))
for (i in seq_len(n)) {
  full_path <- file.path(input_dir, image_paths[i])
  pixels <- image_to_array(image_load(full_path, target_size = c(224, 224)))
  Z[i, , , ] <- array_reshape(pixels, c(1, dim(pixels)))
}
# Shuffle images and labels together (fixed seed for reproducibility),
# then push everything through ResNet-50 up to avg_pool to get one
# 2048-dimensional embedding per image.
set.seed(1)
index <- sample(seq_len(nrow(Z)))
Z <- Z[index, , , ]
y <- y[index]
X <- predict(model_avg_pool, x = imagenet_preprocess_input(Z), verbose = TRUE)
# Flatten to a plain n x 2048 matrix.  Use nrow(Z) instead of the
# hard-coded image count (1184) so this still works if the selected
# category subset changes.
X <- array(X, dim = c(nrow(Z), 2048))
dim(X)
## [1] 1184 2048
Now, I partition my feature matrix into a training set and validation set to use for training my own model. This is a 60/40 split.
# Randomly assign each image to train (60%) or validation (40%).
train_id <- sample(c("train", "valid"), size = nrow(X), replace = TRUE,
                   prob = c(0.6, 0.4))
is_train <- train_id == "train"
X_train <- X[is_train, ]                  # X is a plain matrix
y_train <- to_categorical(y[is_train])    # one-hot targets for keras
Next, I create a neural network of dense layers to be trained on top of the convolutional layers taken from resnet50. Each hidden layer has 256 neurons and uses a dropout rate of 50%.
# Classifier head trained on the frozen ResNet embeddings: two 256-unit
# ReLU layers, each followed by 50% dropout, then a softmax over the
# classes.
n_hidden <- 256
drop_rate <- 0.5
model <- keras_model_sequential()
model %>%
  layer_dense(units = n_hidden, input_shape = ncol(X_train)) %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = drop_rate) %>%
  layer_dense(units = n_hidden) %>%
  layer_activation(activation = "relu") %>%
  layer_dropout(rate = drop_rate) %>%
  layer_dense(units = ncol(y_train)) %>%
  layer_activation(activation = "softmax")
# Compile with RMSprop at half its default learning rate, then train
# for 8 epochs on the training-partition embeddings.
model %>% compile(
  loss = 'categorical_crossentropy',
  optimizer = optimizer_rmsprop(lr = 0.001 / 2),
  metrics = c('accuracy')
)
history <- model %>% fit(X_train, y_train, epochs = 8)
plot(history)
# Predict a class for every image and report accuracy separately for
# the training and validation partitions.
y_pred <- predict_classes(model, X)
is_correct <- y == y_pred
tapply(is_correct, train_id, mean)
## train valid
## 0.9944979 0.9431072
The model results in a ~94% success rate on classifying images in the validation set; pretty good!
table(value = class_names[y + 1L], prediction = class_names[y_pred + 1L], train_id)
## , , train_id = train
##
## prediction
## value accordion anchor ant bass beaver binocular brain
## accordion 33 0 0 0 0 0 0
## anchor 0 25 0 0 0 0 0
## ant 0 0 29 0 0 0 0
## bass 0 0 0 29 0 0 0
## beaver 0 0 0 0 31 0 0
## binocular 0 0 0 0 0 19 0
## brain 0 0 0 0 0 0 63
## brontosaurus 0 0 0 0 0 0 0
## buddha 0 0 0 0 0 0 0
## dalmatian 0 0 0 0 0 0 0
## electric_guitar 0 0 0 0 0 0 0
## inline_skate 0 0 0 0 0 0 0
## lamp 0 0 0 0 0 0 0
## octopus 0 0 1 0 0 0 0
## pagoda 0 0 0 0 0 0 0
## panda 0 0 0 0 0 0 0
## stop_sign 0 0 0 0 0 0 0
## strawberry 0 0 0 0 0 0 0
## sunflower 0 0 0 0 0 0 0
## tick 0 0 0 0 0 0 0
## wrench 0 0 0 0 0 0 0
## yin_yang 0 0 0 0 0 0 0
## prediction
## value brontosaurus buddha dalmatian electric_guitar
## accordion 0 0 0 0
## anchor 0 0 0 0
## ant 0 0 0 0
## bass 0 0 0 0
## beaver 0 0 0 0
## binocular 0 0 0 0
## brain 0 0 0 0
## brontosaurus 32 0 0 0
## buddha 0 49 0 0
## dalmatian 0 0 43 0
## electric_guitar 0 0 0 47
## inline_skate 0 0 0 0
## lamp 0 0 0 0
## octopus 1 0 0 0
## pagoda 0 0 0 0
## panda 0 0 0 0
## stop_sign 0 0 0 0
## strawberry 0 0 0 0
## sunflower 0 0 0 0
## tick 0 0 0 0
## wrench 0 0 0 1
## yin_yang 0 0 0 0
## prediction
## value inline_skate lamp octopus pagoda panda stop_sign
## accordion 0 0 0 0 0 0
## anchor 0 0 0 0 0 0
## ant 0 0 0 0 0 0
## bass 0 0 0 1 0 0
## beaver 0 0 0 0 0 0
## binocular 0 0 0 0 0 0
## brain 0 0 0 0 0 0
## brontosaurus 0 0 0 0 0 0
## buddha 0 0 0 0 0 0
## dalmatian 0 0 0 0 0 0
## electric_guitar 0 0 0 0 0 0
## inline_skate 13 0 0 0 0 0
## lamp 0 37 0 0 0 0
## octopus 0 0 22 0 0 0
## pagoda 0 0 0 24 0 0
## panda 0 0 0 0 27 0
## stop_sign 0 0 0 0 0 44
## strawberry 0 0 0 0 0 0
## sunflower 0 0 0 0 0 0
## tick 0 0 0 0 0 0
## wrench 0 0 0 0 0 0
## yin_yang 0 0 0 0 0 0
## prediction
## value strawberry sunflower tick wrench yin_yang
## accordion 0 0 0 0 0
## anchor 0 0 0 0 0
## ant 0 0 0 0 0
## bass 0 0 0 0 0
## beaver 0 0 0 0 0
## binocular 0 0 0 0 0
## brain 0 0 0 0 0
## brontosaurus 0 0 0 0 0
## buddha 0 0 0 0 0
## dalmatian 0 0 0 0 0
## electric_guitar 0 0 0 0 0
## inline_skate 0 0 0 0 0
## lamp 0 0 0 0 0
## octopus 0 0 0 0 0
## pagoda 0 0 0 0 0
## panda 0 0 0 0 0
## stop_sign 0 0 0 0 0
## strawberry 17 0 0 0 0
## sunflower 0 51 0 0 0
## tick 0 0 26 0 0
## wrench 0 0 0 22 0
## yin_yang 0 0 0 0 40
##
## , , train_id = valid
##
## prediction
## value accordion anchor ant bass beaver binocular brain
## accordion 22 0 0 0 0 0 0
## anchor 0 12 0 0 0 0 0
## ant 0 0 10 0 0 0 0
## bass 0 0 0 20 0 0 1
## beaver 0 0 0 0 15 0 0
## binocular 0 0 0 0 0 14 0
## brain 0 0 0 0 0 0 35
## brontosaurus 0 0 0 0 0 0 0
## buddha 0 0 0 0 0 0 0
## dalmatian 0 0 0 0 0 0 0
## electric_guitar 0 0 1 0 0 0 0
## inline_skate 0 0 0 0 0 1 0
## lamp 0 0 0 0 0 0 0
## octopus 0 0 0 0 0 0 0
## pagoda 0 0 0 0 0 0 0
## panda 0 0 0 0 0 0 0
## stop_sign 0 0 0 0 0 0 0
## strawberry 0 0 0 0 0 0 0
## sunflower 0 0 0 0 0 0 0
## tick 0 0 1 0 0 0 0
## wrench 0 1 0 0 0 0 0
## yin_yang 0 0 0 0 0 0 0
## prediction
## value brontosaurus buddha dalmatian electric_guitar
## accordion 0 0 0 0
## anchor 2 1 0 1
## ant 1 0 0 0
## bass 1 0 0 1
## beaver 0 0 0 0
## binocular 0 0 0 0
## brain 0 0 0 0
## brontosaurus 11 0 0 0
## buddha 0 36 0 0
## dalmatian 0 0 24 0
## electric_guitar 0 0 0 27
## inline_skate 0 0 0 0
## lamp 0 0 0 0
## octopus 4 2 0 0
## pagoda 0 0 0 0
## panda 0 0 0 0
## stop_sign 0 1 0 0
## strawberry 0 0 0 0
## sunflower 0 0 0 0
## tick 0 0 0 0
## wrench 0 0 0 1
## yin_yang 0 0 0 0
## prediction
## value inline_skate lamp octopus pagoda panda stop_sign
## accordion 0 0 0 0 0 0
## anchor 0 0 1 0 0 0
## ant 0 0 0 0 0 1
## bass 0 0 1 0 0 0
## beaver 0 0 0 0 0 0
## binocular 0 0 0 0 0 0
## brain 0 0 0 0 0 0
## brontosaurus 0 0 0 0 0 0
## buddha 0 0 0 0 0 0
## dalmatian 0 0 0 0 0 0
## electric_guitar 0 0 0 0 0 0
## inline_skate 17 0 0 0 0 0
## lamp 0 24 0 0 0 0
## octopus 0 0 4 0 0 0
## pagoda 0 0 0 23 0 0
## panda 0 0 0 0 11 0
## stop_sign 0 0 0 0 0 19
## strawberry 0 0 0 0 0 0
## sunflower 0 0 0 0 0 0
## tick 0 0 0 0 0 0
## wrench 0 0 0 0 0 0
## yin_yang 0 1 0 0 0 0
## prediction
## value strawberry sunflower tick wrench yin_yang
## accordion 0 0 0 0 0
## anchor 0 0 0 0 0
## ant 0 0 1 0 0
## bass 0 0 0 0 0
## beaver 0 0 0 0 0
## binocular 0 0 0 0 0
## brain 0 0 0 0 0
## brontosaurus 0 0 0 0 0
## buddha 0 0 0 0 0
## dalmatian 0 0 0 0 0
## electric_guitar 0 0 0 0 0
## inline_skate 0 0 0 0 0
## lamp 0 0 0 0 0
## octopus 0 0 1 0 0
## pagoda 0 0 0 0 0
## panda 0 0 0 0 0
## stop_sign 0 0 0 0 0
## strawberry 18 0 0 0 0
## sunflower 0 34 0 0 0
## tick 0 0 22 0 0
## wrench 0 0 0 14 0
## yin_yang 0 0 0 0 19
In most cases, no two classes were confused for more than a few examples, and the model is pretty impressively accurate. This probably has to do with the dataset being fairly small, and the pre-trained convolutions being such a powerful tool that it can pick out features despite the relatively small number of examples.
Some common mistakes included confusion between electric guitars and anchors, and between ants and ticks.
Anchors were the most misclassified images, with a total of 6 misclassifications. They were not consistently misclassified to a specific class, but instead were confused basically equally among electric guitars, octopi, and accordions.
My model only misclassified 30 of the 1184 images. Many of the trouble spots seemed to be among the animals. Ticks, ants, and octopi were the most confusing for the model. Looking at the images, it would not be trivial for a human to decide whether each ant was really an ant and not a tick, and vice versa.
Here is a visual of all the misclassified data, with the incorrect prediction superimposed on the image.
# Indices of every image the neural-net head got wrong.
id <- which(y != y_pred)
length(id)
## [1] 30
# Draw each misclassified image in a 6 x 6 grid with the (wrong)
# predicted label overlaid in red near the bottom of the frame.
par(mfrow = c(6, 6))
for (i in id) {
  par(mar = rep(0, 4L))
  plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, type = "n")
  rasterImage(Z[i, , , ] / 255, 0, 0, 1, 1)
  text(0.5, 0.1, label = class_names[y_pred[i] + 1L], col = "red", cex = 2)
}
I see a few examples where I do not blame the model for being incorrect, including certain examples where it is hard for even a human to tell which class the image belongs to. Other understandable errors are misclassifications between ants, ticks, and octopi, which all have long tendrilly legs.
Now, I display the top 5 images from each class in order of decreasing confidence as determined by the model.
# For each class, find the five images the model assigns the highest
# probability of belonging to that class.  Replaces the original
# grow-a-list-with-append() loop (O(n^2) copying, plus a redundant
# `type <- t` temporary) with a single lapply.
y_probs <- predict(model, X)
m_row <- 22
id_list <- lapply(class_names[seq_len(m_row)], function(cls) {
  # Column of the probability matrix corresponding to this class.
  col <- match(cls, class_names)
  order(y_probs[, col], decreasing = TRUE)[1:5]
})
# Plot the top-5 grid: one row per class, five image columns.
par(mfrow = c(m_row, 5))
for (top_ids in id_list) {
  for (i in top_ids) {
    par(mar = c(0, 0, 0, 0))
    plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, type = "n",
         asp = 1)
    rasterImage(Z[i, , , ] / 255, 0, 0, 1, 1)
  }
}
All these examples have clear and characteristic features of each class, with one exception in my opinion: octopus. The 5 examples are close to identical, and could explain why the more natural images of octopi in the dataset were one of the most misclassified in total.
Here I visualize the embedding itself using principal components. One interesting cluster that falls out is the lower right, where pagodas and accordions seem to be learned as similar-looking. This makes sense, as they both are clearly segmented objects.
# Project the embeddings onto their leading principal components and
# scatter PC1 vs PC2, colored by the true class label.
pc_scores <- prcomp(X)$x[, 1:22]
pca <- as_tibble(pc_scores)
pca$y <- class_names[y + 1L]
ggplot(pca, aes(PC1, PC2)) +
  geom_point(aes(color = y), alpha = 1, size = 2) +
  labs(x = "", y = "", color = "class") +
  theme_minimal()
Finally, I use glmnet instead of a neural network on top of the convolutional layers for multinomial logistic regression. The model performs just as well as the dense neural network, and even slightly outperforms it.
# Cross-validated multinomial (lasso) regression on the same
# embeddings.  predict(type = "class") returns character labels, so
# convert back to the 0-based integer coding used for y.
model_glm <- cv.glmnet(X_train, y_train, family = "multinomial")
y_pred_glm <- predict(model_glm, newx = X, type = "class")
y_pred_glm <- as.integer(y_pred_glm) - 1
Here, I calculate the classification rate for the glmnet predictions.
# Classification rate for the glmnet predictions.  The manual counting
# loop is unnecessary -- averaging the logical vector of correct
# predictions gives the rate directly (and the original also shadowed
# base::sum with a variable named `sum`).
mean(y == y_pred_glm)
## [1] 0.9755068
Finally, I visualize all of the misclassified images, with the incorrect predictions superimposed.
# Indices of every image the glmnet model got wrong.
id <- which(y != y_pred_glm)
length(id)
## [1] 29
# Grid of glmnet's misclassified images, with the incorrect predicted
# label overlaid in red.
par(mfrow = c(6, 6))
for (i in id) {
  par(mar = rep(0, 4L))
  plot(0, 0, xlim = c(0, 1), ylim = c(0, 1), axes = FALSE, type = "n")
  rasterImage(Z[i, , , ] / 255, 0, 0, 1, 1)
  text(0.5, 0.1, label = class_names[y_pred_glm[i] + 1L], col = "red",
       cex = 2)
}
There is a subfield of ML focused on creating adversarial examples to various models.
The idea is that you can take an image, tweak the pixels in a way imperceptible to humans, and trick the model into classifying the image completely incorrectly.
Is there a way to manipulate the input data slightly that will trick our model into misclassifying the new image? Here, I tried to manipulate the data in a couple of different ways. Instead of really creating true adversarial examples, I think it somehow speaks to the robustness of the model itself.
I try a number of different techniques, and show which tweaks create the most error in the model.
First, I try a strategy of randomly adding 2 to 100 different features for each image.
# Perturbation 1: add 2 to 100 randomly chosen features of each image's
# embedding.  BUG FIX: feature indices must be drawn from the embedding
# columns (ncol(X_adv) = 2048), not from the image count (1184) as the
# original did -- sampling from 1:1184 meant features 1185-2048 could
# never be perturbed.  The inner element-by-element loop is also
# replaced by a single vectorized assignment.
X_adv <- X
n_features <- ncol(X_adv)
for (row in seq_len(nrow(X_adv))) {
  hit <- sample.int(n_features, 100)
  X_adv[row, hit] <- X_adv[row, hit] + 2
}
# Accuracy on the perturbed embeddings, split by partition.
y_pred_adv <- predict_classes(model, X_adv)
still_correct <- y == y_pred_adv
tapply(still_correct, train_id, mean)
## train valid
## 0.9834938 0.9409190
Given this change, the model still performs really well, losing just a few percentage points.
Next, I try a strategy of randomly dropping 600 features to 0. Do you think that the model will predict better or worse than incrementing 100 random features by 2?
# Perturbation 2: zero out 600 randomly chosen features per image.
# Same fix as the previous perturbation: sample feature indices from
# the ncol(X_adv) = 2048 embedding columns rather than the image count,
# and zero the selected entries with one vectorized assignment.
X_adv <- X
n_features <- ncol(X_adv)
for (row in seq_len(nrow(X_adv))) {
  hit <- sample.int(n_features, 600)
  X_adv[row, hit] <- 0
}
# Accuracy after zeroing features, split by partition.
y_pred_adv_1 <- predict_classes(model, X_adv)
still_correct <- y == y_pred_adv_1
tapply(still_correct, train_id, mean)
## train valid
## 0.9903714 0.9431072
These strategies are just simple ways to stress test the model, and as you can see the model’s accuracy still holds up very well. To create a truly adversarial example, where the classification is very confident yet very wrong, a bit more sophistication is needed to use the model and gradient descent to figure out how to tweak the picture in the way you want.